Install relevant R packages:
source("http://bioconductor.org/biocLite.R")
biocLite(c("supraHex","devtools","dnet","XGR","dplyr"))
devtools::install_github(c("hfang-bristol/supraHex", "hfang-bristol/dnet", "hfang-bristol/XGR"))
Here we illustrate the power of I3 in interpreting eGenes found in 10 brain subregions (and the whole blood as comparisons) with respect to their selective pressure against mutations. In brief, GTEx V6p tissue-specific eGene datasets are used as input data for training and ExAC loss-of-function (LoF) datasets used as additional data for integration.
Overview of the workflow:
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import GTEx and ExAC datasets:
# import GTEx datasets
GTEx_V6p_matrix <- xRDataLoader(RData.customised='GTEx_V6p_matrix', RData.location=RData.location)
## brain or blood tissues
ind <- grepl('Brain|Blood', colnames(GTEx_V6p_matrix))
GTEx_V6p_matrix <- GTEx_V6p_matrix[,ind]
colnames(GTEx_V6p_matrix) <- gsub('Brain_','',colnames(GTEx_V6p_matrix))
## only significant eGenes (q-value<0.05)
GTEx_V6p_matrix <- -log10(GTEx_V6p_matrix)
ind <- apply(GTEx_V6p_matrix>-log10(0.05), 1, sum)
GTEx_matrix <- GTEx_V6p_matrix[ind!=0,]
# import ExAC datasets
ExAC <- xRDataLoader(RData.customised='ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
## ExAC LoF intolerance genes: binary (1 for yes, 0 for no)
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0
# extract genes commons to GTEx and ExAC datasets
ind <- match(rownames(ExAC_matrix), rownames(GTEx_matrix))
G2GTEx <- GTEx_matrix[ind[!is.na(ind)],]
G2ExAC <- data.frame(pLI=ExAC_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2ExAC) <- rownames(ExAC_matrix)[!is.na(ind)]
Justify the use of the diamond-shaped map:
data <- G2GTEx
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=32) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the diamond-shaped map using GTEx tissue eGene datasets:
data <- G2GTEx
sMg <- sPipeline(data, shape="diamond", finetuneSustain=T, scale=3)
The ExAC LoF intolerance map is obtained by overlaying the ExAC data onto the trained GTEx tissue map:
sOverlay_ExAC <- sMapOverlay(sMg, data=G2GTEx, additional=G2ExAC)
Visualise ExAC LoF intolerance map:
colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_ExAC, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0,0.4), gp=grid::gpar(cex=1), newpage=F)
Rankings of 11 tissues in terms of similarity (Pearson correlation) to the ExAC LoF intolerance map:
M_GTEx <- sMg$codebook
M_ExAC <- sOverlay_ExAC$codebook
y <- M_ExAC[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_GTEx), function(i){
x <- data.frame(name=1:nrow(M_GTEx), M_GTEx[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(tissue=colnames(M_GTEx)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar dotplot
df_tmp <- df
df_tmp$tissue <- gsub('_','\n',df_tmp$tissue)
gp_GTEx <- xPolarDot(df_tmp, size=1.5, parallel=T, signature=F)
gp_GTEx + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))
Visualise GTEx tissue map (reordered by correlation) as the landscape:
xMap <- sMg
ind <- match(df$tissue, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=11:1, rect.grid=c(1,11), title.rotate=30, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,6), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on GTEx tissue map:
xM <- sMg
ind <- match(df$tissue, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate the probability of genes being LoF intolerant per gene cluster:
colormap <- xColormap("jet.top")
output_ExAC <- visDmatHeatmap(xM, data=G2GTEx[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,6), KeyValueName=expression(-log[10]("q-value")), labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output_ExAC$ID, rownames(G2ExAC))
output_ExAC$val <- G2ExAC[ind,'pLI']
#write.table(output_ExAC,file='output.GTEx.txt',sep='\t',row.names=F)
ls_tmp <- split(x=output_ExAC$val, f=output_ExAC$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
x <- ls_tmp[[i]]
y <- mean(x)
data.frame(Cluster=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp
Perform gene set enrichment analysis for clustered genes:
ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- output_ExAC$ID
ontologies <- "CGL"
ls_res_CGL <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,20000), test="hypergeo", min.overlap=5, p.tail="two-tails", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#gp <- xEnrichForest(ls_res_CGL, top_num="auto", FDR.cutoff=0.05, signature=F)
ls_res_CGL$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
Perform evolutionary analysis for clustered genes:
ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "PSG"
ls_res_PSG <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,20000), test="fisher", min.overlap=5, p.tail="two-tails", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#gp <- xEnrichForest(ls_res_PSG, top_num="auto", FDR.cutoff=0.05, signature=F)
ls_res_PSG$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
We here demonstrate the utility of I3 in understanding positive genetic regulators with respect to three factors: phenotypic effects, expression and LoF intolerance. That is, we use haploid genetic screen datasets as input for the training and datasets regarding phenotypic effects, expression and LoF intolerance used as additional data for integration.
Overview of the workflow:
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import Haploid and define combined datasets (phenotypic effects, expression and LoF):
# import haploid datasets (only positive regulators with negative mutation index)
Haploid <- xRDataLoader(RData.customised='Haploid_regulators', RData.location=RData.location)
## all phenotypes except 'HMGCR'
ind <- grepl('HMGCR', Haploid$Phenotype)
df <- Haploid[!ind,]
df <- df[df$MI<0,]
Haploid_matrix <- as.matrix(xSparseMatrix(df[,c('Gene','Phenotype','MI')]))
# number of phenotypes per gene
vec_count <- apply(Haploid_matrix!=0, 1, sum)
# median expression
EXP <- xRDataLoader(RData.customised='Haploid_expression', RData.location=RData.location)
vec_expression <- apply(EXP, 1, median)
# ExAC LoF intolerant
ExAC <- xRDataLoader(RData.customised='ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0
vec_lof <- ExAC_matrix[,1]
names(vec_lof) <- rownames(ExAC_matrix)
# combine three above
mat <- matrix(0, nrow=nrow(Haploid_matrix), ncol=3)
rownames(mat) <- rownames(Haploid_matrix)
colnames(mat) <- c('Count','Expression','LoF')
ind <- match(rownames(mat), names(vec_count))
mat[!is.na(ind),1] <- vec_count[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_expression))
mat[!is.na(ind),2] <- vec_expression[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_lof))
mat[!is.na(ind),3] <- vec_lof[ind[!is.na(ind)]]
Combined_matrix <- mat
# extract genes commons to Haploid and Combined datasets
ind <- match(rownames(Combined_matrix), rownames(Haploid_matrix))
G2Haploid <- Haploid_matrix[ind[!is.na(ind)],]
G2Combined <- data.frame(Combined_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2Combined) <- rownames(Combined_matrix)[!is.na(ind)]
Train the trefoil-shaped map using Haploid genetic regulator datasets:
sMh <- sPipeline(data=G2Haploid, shape="trefoil", finetuneSustain=T, scale=3)
The map is obtained by overlaying/fusing the combined datasets onto the trained Haploid protein map:
sO_Combined <- sMapOverlay(sMh, data=G2Haploid, additional=G2Combined)
Visualise overlaid phenotypic effect map:
colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=1, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(1,5), gp=grid::gpar(cex=1), newpage=F)
Visualise overlaid expression map:
colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=2, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(6,10), gp=grid::gpar(cex=1), newpage=F)
Visualise overlaid LoF map:
colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=3, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0.3,0.7), gp=grid::gpar(cex=1), newpage=F)
Visualise Haploid protein map in 2D landscape:
sReorder <- sCompReorder(sMh, metric="euclidean")
colormap <- colorRampPalette(c("blue","#007FFF","cyan"), interpolate="spline")
visCompReorder(sMh, sReorder, title.rotate=0, title.xy=c(0.4,0.95), colormap=colormap, zlim=c(-1,0), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on Haploid protein map:
xM <- sMh
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate the overlaid values averaged per gene cluster:
colormap <- xColormap("jet.both")
output_Combined <- visDmatHeatmap(xM, data=G2Haploid, sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(-1,1), KeyValueName=expression(-log[2]("Mutation index")), labRow=NA, reorderRow="none", srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output_Combined$ID, rownames(G2Combined))
df_tmp <- cbind(output_Combined, G2Combined[ind,])
#write.table(df_tmp,file='output.Combined.txt',sep='\t',row.names=F)
df <- as.data.frame(df_tmp %>% dplyr::mutate(Cluster=paste0('C',Cluster_base)) %>% dplyr::group_by(Cluster) %>% dplyr::summarise(Count=mean(Count), Expression=mean(Expression),LoF=mean(LoF)))
ls_gp <- lapply(2:ncol(df), function(i){
gp <- xPolarBar(df[,c(1,i)], colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp + labs(title=colnames(df)[i])
})
gridExtra::grid.arrange(grobs=ls_gp, nrow=1, ncol=3)
Perform pathway analysis for clustered genes:
ls_cluster <- split(x=output_Combined$ID, f=output_Combined$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "REACTOME"
ls_res_reactome <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(20,500), test="hypergeo", min.overlap=10, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.01, displayBy="zscore")
#ls_res_reactome$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
df <- ls_res_reactome$df
df <- as.data.frame(df %>% dplyr::filter(adjp < 0.01))
mat <- as.matrix(xSparseMatrix(df[,c('name','group','zscore')], columns=names(ls_cluster)))
mat[mat==0] <- NA
gp <- xHeatmap(t(mat), reorder=c("none","row","col","both")[3], colormap="#E6F598-#ABDDA4-#66C2A5-#3288BD", zlim=c(2,10), na.color='transparent', size=2.5, x.rotate=30, x.text.size=5, y.text.size=6, barheight=5, legend.title='Z-score')
gp
Perform druggable analysis for clustered genes:
ls_cluster <- split(x=output_Combined$ID, f=output_Combined$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "DGIdb"
ls_res_dgidb <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,2000), test="fisher", min.overlap=5, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#ls_res_dgidb$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))
gp <- xEnrichForest(ls_res_dgidb, top_num="auto", FDR.cutoff=0.01, signature=F)
gp
#write.table(gp$data,file='output.DGIdb.txt',sep='\t',row.names=F)
Here is the output of sessionInfo():
> R version 3.4.3 (2017-11-30)
> Platform: x86_64-apple-darwin15.6.0 (64-bit)
> Running under: OS X El Capitan 10.11.6
>
> Matrix products: default
> BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
> LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
>
> locale:
> [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
>
> attached base packages:
> [1] grid stats graphics grDevices utils datasets methods
> [8] base
>
> other attached packages:
> [1] XGR_1.1.2 ggplot2_2.2.1 dnet_1.1.2 igraph_1.1.2
> [5] supraHex_1.13.3 hexbin_1.27.1 png_0.1-7 BiocStyle_2.6.1
>
> loaded via a namespace (and not attached):
> [1] Biobase_2.38.0 httr_1.3.1
> [3] bit64_0.9-7 AnnotationHub_2.10.1
> [5] network_1.13.0 shiny_1.0.5
> [7] assertthat_0.2.0 interactiveDisplayBase_1.16.0
> [9] stats4_3.4.3 blob_1.1.0
> [11] BSgenome_1.46.0 GenomeInfoDbData_0.99.1
> [13] Rsamtools_1.30.0 ggrepel_0.7.0
> [15] yaml_2.1.16 pillar_1.1.0
> [17] RSQLite_2.0 backports_1.1.1
> [19] lattice_0.20-35 glue_1.2.0
> [21] digest_0.6.12 GenomicRanges_1.30.0
> [23] XVector_0.18.0 colorspace_1.3-2
> [25] htmltools_0.3.6 httpuv_1.3.5
> [27] Matrix_1.2-12 plyr_1.8.4
> [29] XML_3.98-1.9 pkgconfig_2.0.1
> [31] misc3d_0.8-4 bookdown_0.5
> [33] zlibbioc_1.24.0 xtable_1.8-2
> [35] RCircos_1.2.0 scales_0.5.0
> [37] sna_2.4 BiocParallel_1.12.0
> [39] tibble_1.4.2 IRanges_2.12.0
> [41] SummarizedExperiment_1.8.0 BiocGenerics_0.24.0
> [43] lazyeval_0.2.1 statnet.common_4.0.0
> [45] magrittr_1.5 mime_0.5
> [47] memoise_1.1.0 evaluate_0.10.1
> [49] nlme_3.1-131 MASS_7.3-47
> [51] graph_1.56.0 BiocInstaller_1.28.0
> [53] tools_3.4.3 matrixStats_0.52.2
> [55] stringr_1.2.0 S4Vectors_0.16.0
> [57] munsell_0.4.3 DelayedArray_0.4.1
> [59] AnnotationDbi_1.40.0 bindrcpp_0.2
> [61] Biostrings_2.46.0 compiler_3.4.3
> [63] GenomeInfoDb_1.14.0 rlang_0.1.6
> [65] plot3D_1.1.1 RCurl_1.95-4.8
> [67] bitops_1.0-6 rmarkdown_1.8
> [69] gtable_0.2.0 GenomicScores_1.2.0
> [71] DBI_0.7 R6_2.2.2
> [73] GenomicAlignments_1.14.1 knitr_1.17
> [75] dplyr_0.7.4 rtracklayer_1.38.1
> [77] bit_1.1-12 bindr_0.1
> [79] rprojroot_1.2 ggnetwork_0.5.1
> [81] Rgraphviz_2.22.0 ape_5.0
> [83] stringi_1.1.6 parallel_3.4.3
> [85] Rcpp_0.12.15